Script to reproduce years based on a model trained with a specific year¶

Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import salishsea_tools.viz_tools as sa_vi

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm.auto import tqdm
import random

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)'])])
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)
    X_train, _, y_train, _ = train_test_split(inputs2, targets)

    model = MLPRegressor(hidden_layer_sizes=200)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train)

    return (regr)

Regressor for Other Years (Annually)¶

In [ ]:
def regressor2 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor for Other Years (Daily)¶

In [ ]:
def regressor3 (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor for Individual Days¶

In [ ]:
def regressor4 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs = regr.predict(inputs2)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The root mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting (variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training with the Selected Year¶

In [ ]:
# Dataset and date
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
dates = pd.DatetimeIndex(ds['time_counter'].values)

# just an example
year = 2007
    
dataset = ds.sel(time_counter=str(year))

drivers, diat, _ = datasets_preparation(dataset)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in tqdm(range (2007,2024)):
    
    dataset = ds.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
  0%|          | 0/17 [00:00<?, ?it/s]
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.486
The correlation coefficient is: 0.875
 The root mean square error is: 0.07810872
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404
The slope of the best fitting line is  0.47
The correlation coefficient is: 0.637
 The root mean square error is: 0.12396264
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.402
The correlation coefficient is: 0.588
 The root mean square error is: 0.16719967
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.474
The correlation coefficient is: 0.499
 The root mean square error is: 0.15231907
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.519
The correlation coefficient is: 0.565
 The root mean square error is: 0.14611396
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404
The slope of the best fitting line is  0.457
The correlation coefficient is: 0.644
 The root mean square error is: 0.12551421
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.44
The correlation coefficient is: 0.59
 The root mean square error is: 0.14834361
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.455
The correlation coefficient is: 0.568
 The root mean square error is: 0.14273798
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.412
The correlation coefficient is: 0.205
 The root mean square error is: 0.2080702
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404
The slope of the best fitting line is  0.417
The correlation coefficient is: 0.443
 The root mean square error is: 0.17821921
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.484
The correlation coefficient is: 0.611
 The root mean square error is: 0.12873062
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.43
The correlation coefficient is: 0.196
 The root mean square error is: 0.1972452
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.419
The correlation coefficient is: 0.291
 The root mean square error is: 0.1975733
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404
The slope of the best fitting line is  0.394
The correlation coefficient is: 0.302
 The root mean square error is: 0.21968317
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.465
The correlation coefficient is: 0.618
 The root mean square error is: 0.14783666
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.495
The correlation coefficient is: 0.566
 The root mean square error is: 0.13722642
No description has been provided for this image
/tmp/ipykernel_528/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned
  m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925
The slope of the best fitting line is  0.506
The correlation coefficient is: 0.477
 The root mean square error is: 0.16018644
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
  0%|          | 0/1279 [00:00<?, ?it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in tqdm(maps):

    dataset = ds.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset)

    diat_i = dataset['Diatom']

    regressor4(drivers, diat, 'Diatom ')
  0%|          | 0/10 [00:00<?, ?it/s]
The amount of data points is 46479
The slope of the best fitting line is  0.046
The correlation coefficient is: 0.019
 The root mean square error is: 0.195463
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  0.349
The correlation coefficient is: 0.104
 The root mean square error is: 0.2161055
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  0.725
The correlation coefficient is: 0.259
 The root mean square error is: 0.1806715
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  0.029
The correlation coefficient is: 0.013
 The root mean square error is: 0.17164007
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  -0.282
The correlation coefficient is: -0.109
 The root mean square error is: 0.17115316
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  1.079
The correlation coefficient is: 0.441
 The root mean square error is: 0.17788132
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  0.092
The correlation coefficient is: 0.033
 The root mean square error is: 0.16951112
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  0.919
The correlation coefficient is: 0.33
 The root mean square error is: 0.16843781
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  0.382
The correlation coefficient is: 0.15
 The root mean square error is: 0.19157225
No description has been provided for this image
No description has been provided for this image
The amount of data points is 46479
The slope of the best fitting line is  -0.384
The correlation coefficient is: -0.273
 The root mean square error is: 0.25859556
No description has been provided for this image
No description has been provided for this image
In [ ]: